How to do it?:
Open the Rmarkdown file of this assignment (link) in Rstudio.
Right under each question, insert a code chunk
(you can use the hotkey Ctrl + Alt + I to add a code chunk)
and code the solution for the question.
Knit the rmarkdown file (hotkey:
Ctrl + Alt + K) to export an html.
Publish the html file to your GitHub Pages site.
Submission: Submit the GitHub link to the assignment on Canvas.
Input: a data frame
Output: a data frame with all missing values of numeric variables replaced by the corresponding column means.
Hint: Similar function
library(tidyverse)
# Read the census data by its full path instead of calling setwd(), so the
# session's working directory is left untouched.
df <- read_csv(
  file.path("C:/Users/student/Downloads", "adult_census_missing.csv")
)
# Replace the missing values of a numeric vector by the mean of its observed
# values.
#
# x: any vector. Non-numeric vectors are returned unchanged.
# Returns a vector of the same length as x. A numeric x that has both missing
# and observed values comes back with the missing entries set to the mean (an
# integer x becomes double in that case, because the mean may be fractional).
mean_impute <- function(x) {
  if (is.numeric(x)) {
    missing <- is.na(x)
    # Only impute when there is something to fill and something to average;
    # an all-missing column is left as is rather than being filled with NaN.
    if (any(missing) && !all(missing)) {
      x[missing] <- mean(x[!missing])
    }
  }
  x
}
# Apply mean_impute() to every column of a data frame, so that missing values
# in numeric columns are replaced by the column mean. mean_impute() returns
# non-numeric columns unchanged.
#
# d: a data frame.
# Returns the data frame with its columns replaced by the imputed versions.
numeric_impute <- function(d) {
  # seq_along() gives an empty sequence for a zero-column data frame, whereas
  # 1:length(d) would iterate over c(1, 0) and fail on d[[1]].
  for (i in seq_along(d)) {
    d[[i]] <- mean_impute(d[[i]])
  }
  d
}
# Count the missing values in each column of df.
colSums(is.na(df))
## age workclass fnlwgt education education.num
## 30 34 0 15 0
## marital.status occupation relationship race sex
## 26 35 0 0 24
## capital.gain capital.loss hours.per.week native.country income
## 8 0 0 15 0
# Impute the columns of df with numeric_impute(), then recount the missing
# values per column of the result.
d1 <- numeric_impute(df)
colSums(is.na(d1))
## age workclass fnlwgt education education.num
## 0 34 0 15 0
## marital.status occupation relationship race sex
## 26 35 0 0 24
## capital.gain capital.loss hours.per.week native.country income
## 0 0 0 15 0
Input: a data frame
Output: a data frame with all missing values replaced by the corresponding column means (for numeric variables) or modes (for non-numeric variables).
Hint: Use If-statement to combine the function in Problem 1 and the function in this example
# Replace the missing values of a vector: by the mean of the observed values
# when the vector is numeric, otherwise by the most frequent observed value
# (the mode).
#
# x: any atomic vector or factor.
# Returns a vector of the same length and, for non-numeric input, the same
# type as x. Input with no missing values, or with no observed values to
# impute from, is returned unchanged.
mode_impute <- function(x) {
  missing <- is.na(x)
  # Nothing to fill, or nothing observed to fill with.
  if (!any(missing) || all(missing)) {
    return(x)
  }
  observed <- x[!missing]
  if (is.numeric(x)) {
    x[missing] <- mean(observed)
  } else {
    # Count occurrences over the sorted distinct values so that ties resolve to
    # the first value in sorted order, as table() would. Picking the value from
    # `candidates` (rather than from names(table(x))) keeps the original type,
    # so logical, factor and Date columns are filled correctly as well.
    candidates <- sort(unique(observed))
    counts <- tabulate(match(observed, candidates), nbins = length(candidates))
    x[missing] <- candidates[which.max(counts)]
  }
  x
}
# Apply mode_impute() to every column of a data frame, so that each column is
# imputed by whatever rule mode_impute() applies to it.
#
# d: a data frame.
# Returns the data frame with its columns replaced by the imputed versions.
numeric_impute <- function(d) {
  # seq_along() gives an empty sequence for a zero-column data frame, whereas
  # 1:length(d) would iterate over c(1, 0) and fail on d[[1]].
  for (i in seq_along(d)) {
    d[[i]] <- mode_impute(d[[i]])
  }
  d
}
# Count the missing values in each column of df.
colSums(is.na(df))
## age workclass fnlwgt education education.num
## 30 34 0 15 0
## marital.status occupation relationship race sex
## 26 35 0 0 24
## capital.gain capital.loss hours.per.week native.country income
## 8 0 0 15 0
# Impute the columns of df with numeric_impute(), then recount the missing
# values per column of the result.
d1 <- numeric_impute(df)
colSums(is.na(d1))
## age workclass fnlwgt education education.num
## 0 0 0 0 0
## marital.status occupation relationship race sex
## 0 0 0 0 0
## capital.gain capital.loss hours.per.week native.country income
## 0 0 0 0 0
Input: a data frame
Output: Bar plots of all non-numeric variables
Hint: Similar function
# Print one bar plot (category counts) for every non-numeric column of a data
# frame.
#
# d: a data frame.
# Called for its side effect of printing plots; returns NULL invisibly.
density_plot <- function(d) {
  # seq_along() is safe for a zero-column data frame, unlike 1:length(d).
  for (i in seq_along(d)) {
    if (!is.numeric(d[[i]])) {
      column <- names(d)[i]
      # Plain geom_bar() shows the count per category. position = "fill"
      # without a fill aesthetic rescales every bar to height 1, which hides
      # the distribution entirely.
      print(
        ggplot2::ggplot(d, ggplot2::aes(x = .data[[column]])) +
          ggplot2::geom_bar() +
          ggplot2::labs(x = column)
      )
    }
  }
  invisible(NULL)
}
# Print a bar plot for each non-numeric column of df.
density_plot(df)
Input: a data frame
Output: all possible bar plots of a non-numeric variable filled by another non-numeric variable.
Hint: Similar function
# Print a stacked bar plot for every pair of non-numeric columns (i < j) of a
# data frame: column i on the x axis, column j as the fill colour.
#
# d: a data frame.
# Called for its side effect of printing plots; returns NULL invisibly.
density_plot2 <- function(d) {
  n_cols <- length(d)
  # With fewer than two columns there are no pairs; 1:(n_cols - 1) would
  # otherwise count downwards and index columns that do not exist.
  if (n_cols < 2) {
    return(invisible(NULL))
  }
  # Classify the columns once instead of re-testing inside the double loop.
  is_categorical <- !vapply(d, is.numeric, logical(1))
  columns <- names(d)
  for (i in seq_len(n_cols - 1)) {
    for (j in (i + 1):n_cols) {
      if (is_categorical[[i]] && is_categorical[[j]]) {
        print(
          ggplot2::ggplot(
            d,
            ggplot2::aes(x = .data[[columns[i]]], fill = .data[[columns[j]]])
          ) +
            ggplot2::geom_bar() +
            ggplot2::labs(x = columns[i], fill = columns[j])
        )
      }
    }
  }
  invisible(NULL)
}
# Print a bar plot for each pair of non-numeric columns of df.
density_plot2(df)
Input: a data frame
Output:
all possible bar plots of a non-numeric variable filled by another non-numeric variable.
all possible density plots of a numeric variable colored by a non-numeric variable.
all possible the scatter plots.
Hint: Combine this
function, this
function, and the function in Question 4. One way to combine is
creating a new function, quick_plot, and call these three
functions within quick_plot.
# Print a quick exploratory plot for every pair of columns (i < j) of a data
# frame:
#   * two non-numeric columns     -> bar plot of column i filled by column j
#   * one numeric, one non-numeric -> density of the numeric column coloured by
#                                     the non-numeric one (either column order)
#   * two numeric columns         -> scatter plot of column i against column j
#
# d: a data frame.
# Called for its side effect of printing plots; returns NULL invisibly.
density_plot3 <- function(d) {
  n_cols <- length(d)
  # With fewer than two columns there are no pairs; 1:(n_cols - 1) would
  # otherwise count downwards and index columns that do not exist.
  if (n_cols < 2) {
    return(invisible(NULL))
  }
  # Classify the columns once instead of re-testing inside the double loop.
  is_num <- vapply(d, is.numeric, logical(1))
  columns <- names(d)
  for (i in seq_len(n_cols - 1)) {
    for (j in (i + 1):n_cols) {
      if (is_num[[i]] && is_num[[j]]) {
        # A scatter plot needs no position adjustment; position = "dodge" has
        # no width to work with here and only triggers "Width not defined"
        # warnings.
        print(
          ggplot2::ggplot(
            d,
            ggplot2::aes(x = .data[[columns[i]]], y = .data[[columns[j]]])
          ) +
            ggplot2::geom_point() +
            ggplot2::labs(x = columns[i], y = columns[j])
        )
      } else if (!is_num[[i]] && !is_num[[j]]) {
        print(
          ggplot2::ggplot(
            d,
            ggplot2::aes(x = .data[[columns[i]]], fill = .data[[columns[j]]])
          ) +
            ggplot2::geom_bar() +
            ggplot2::labs(x = columns[i], fill = columns[j])
        )
      } else {
        # Mixed pair: put the numeric column on the x axis whichever of the
        # two comes first, so pairs where the non-numeric column precedes the
        # numeric one are no longer skipped.
        num_col <- if (is_num[[i]]) columns[i] else columns[j]
        cat_col <- if (is_num[[i]]) columns[j] else columns[i]
        print(
          ggplot2::ggplot(
            d,
            ggplot2::aes(x = .data[[num_col]], color = .data[[cat_col]])
          ) +
            ggplot2::geom_density() +
            ggplot2::labs(x = num_col, color = cat_col)
        )
      }
    }
  }
  invisible(NULL)
}
# Print the bar, density and scatter plots for the column pairs of df.
density_plot3(df)
## Warning: Removed 30 rows containing non-finite values (`stat_density()`).
## Warning: Width not defined
## ℹ Set with `position_dodge(width = ...)`
## Warning: Removed 30 rows containing missing values (`geom_point()`).
## Warning: Removed 30 rows containing non-finite values (`stat_density()`).
## Warning: Width not defined
## ℹ Set with `position_dodge(width = ...)`
## Removed 30 rows containing missing values (`geom_point()`).
## Warning: Removed 30 rows containing non-finite values (`stat_density()`).
## Warning: Removed 30 rows containing non-finite values (`stat_density()`).
## Warning: Removed 30 rows containing non-finite values (`stat_density()`).
## Warning: Removed 30 rows containing non-finite values (`stat_density()`).
## Warning: Removed 30 rows containing non-finite values (`stat_density()`).
## Warning: Width not defined
## ℹ Set with `position_dodge(width = ...)`
## Warning: Removed 38 rows containing missing values (`geom_point()`).
## Warning: Width not defined
## ℹ Set with `position_dodge(width = ...)`
## Warning: Removed 30 rows containing missing values (`geom_point()`).
## Warning: Width not defined
## ℹ Set with `position_dodge(width = ...)`
## Removed 30 rows containing missing values (`geom_point()`).
## Warning: Removed 30 rows containing non-finite values (`stat_density()`).
## Warning: Groups with fewer than two data points have been dropped.
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf
## Warning: Removed 30 rows containing non-finite values (`stat_density()`).
## Warning: Width not defined
## ℹ Set with `position_dodge(width = ...)`
## Warning: Width not defined
## ℹ Set with `position_dodge(width = ...)`
## Warning: Removed 8 rows containing missing values (`geom_point()`).
## Warning: Width not defined
## ℹ Set with `position_dodge(width = ...)`
## Warning: Width not defined
## ℹ Set with `position_dodge(width = ...)`
## Warning: Groups with fewer than two data points have been dropped.
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf
## Warning: Width not defined
## ℹ Set with `position_dodge(width = ...)`
## Warning: Removed 8 rows containing missing values (`geom_point()`).
## Warning: Width not defined
## ℹ Set with `position_dodge(width = ...)`
## Warning: Width not defined
## ℹ Set with `position_dodge(width = ...)`
## Warning: Groups with fewer than two data points have been dropped.
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf
## Warning: Width not defined
## ℹ Set with `position_dodge(width = ...)`
## Warning: Removed 8 rows containing missing values (`geom_point()`).
## Warning: Width not defined
## ℹ Set with `position_dodge(width = ...)`
## Removed 8 rows containing missing values (`geom_point()`).
## Warning: Removed 8 rows containing non-finite values (`stat_density()`).
## Warning: Groups with fewer than two data points have been dropped.
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf
## Warning: Removed 8 rows containing non-finite values (`stat_density()`).
## Warning: Width not defined
## ℹ Set with `position_dodge(width = ...)`
## Warning: Groups with fewer than two data points have been dropped.
## no non-missing arguments to max; returning -Inf
## Warning: Groups with fewer than two data points have been dropped.
## no non-missing arguments to max; returning -Inf